# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Silence pandas/statsmodels chained-assignment and deprecation chatter for the notebook.
warnings.filterwarnings('ignore')
# Load the Hosted Games dataset; the file is tab-separated despite the .csv extension.
data = pd.read_csv('data.csv', sep='\t')
data.head()
import datetime
# Parse 'Release Date' into proper pandas datetimes, then count stories per release year.
release_dates = pd.to_datetime(data['Release Date'])
data['Release Date'] = release_dates
date_group = data.groupby([data['Release Date'].dt.year]).count()
date_group
# Line chart: one point per release year, x-markers for visibility.
years = date_group['Release Date'].index
counts = date_group['HG Story Title']
plt.plot(years, counts, marker='x')
plt.title('Stories released per year')
plt.xlabel('Release Year')
plt.ylabel('Count')
plt.xticks(years, rotation=45)
plt.grid(True)
plt.show()
Overall, the trend is increasing until a peak in 2018. Data for 2021 is incomplete.
# Number of stories in each primary genre (count of non-null titles per genre).
genre_group = data.groupby('Genre')[['HG Story Title']].count()
genre_group
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
# Pie chart of each genre's share of all stories, rendered inline.
genre_pie = go.Pie(labels=genre_group.index, values=genre_group['HG Story Title'])
fig = go.Figure(data=[genre_pie])
iplot(fig)
# The same counts as a horizontal bar chart, easier to read exact values from.
plt.figure(figsize=(10, 8))
plt.barh(genre_group.index, genre_group['HG Story Title'])
plt.title('Stories released per genre')
plt.ylabel('Genre')
plt.xlabel('Count')
plt.grid(True, axis='x')
plt.show()
Fantasy is the most common genre, followed by Supernatural and Sci-Fi.
# Popularity = combined rating count across both platforms. Using .add (no
# fill_value) so a missing Google count still yields NaN, exactly like `+`.
data['Popularity'] = data['# of Omnibus Ratings'].add(data['# of Google Reviews'])
px.scatter(data, x='# of Omnibus Ratings', y='# of Google Reviews', hover_data=['HG Story Title'], trendline='ols')
This doesn't look so good but let's just calculate a correlation...
import statsmodels.api as sm
import statsmodels.formula.api as smf
It turns out that some of the games don't have Google review data. Which are those games?
# Show the games with no Google Play review data.
data.loc[data['# of Google Reviews'].isna()]
Alright, let's just remove those games from the data.
# Keep only games that have Google review data, then regress the omnibus
# rating count on the Google review count (with an intercept).
data_clean = data[data['# of Google Reviews'].notna()]
design = sm.add_constant(data_clean['# of Google Reviews'])
results = sm.OLS(data_clean['# of Omnibus Ratings'], design).fit()
print(results.summary())
That's a lot of numbers! Here I am attempting to discern the relationship between the number of Omnibus ratings and the number of Google reviews.
An important number here is R-squared. Basically, this shows the strength of the relationship between the two variables. It's 0.496, which is pretty decent. So there definitely is a relationship between the two popularity metrics. But I'm guessing the outlier kind of skews the results... What is the outlier, anyway? That seems to be by far the most popular game published by Hosted Games.
# Ten most popular games overall (includes rows with NaN Popularity at the end).
data_popular = data.sort_values(by='Popularity', ascending=False)
data_popular.head(10)
The outlier that we saw earlier was The Great Tournament, which is the most popular Hosted Game.
By most metrics, Philip Kempton's The Great Tournament and Life of a Mercenary are the two most popular Hosted Games. Congrats I guess.
# Does a longer story mean a more popular story? Regress Popularity on Word Count.
wc_design = sm.add_constant(data_clean['Word Count'])
results = sm.OLS(data_clean['Popularity'], wc_design).fit()
print(results.summary())
px.scatter(data_clean, x='Word Count', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
There's definitely a positive relationship between word count and popularity, but the R^2 between word count and popularity is about 0.08, which is... very weak.
What happens if we remove The Great Tournament?
# Re-fit with the dominant outlier (The Great Tournament) dropped.
is_outlier = data_clean['HG Story Title'] == 'The Great Tournament'
data_no_outliers = data_clean[~is_outlier]
results = sm.OLS(data_no_outliers['Popularity'], sm.add_constant(data_no_outliers['Word Count'])).fit()
print(results.summary())
Now, the R^2 improves to 0.193, which is... better?
The regression equation is $Popularity = 0.0066*WordCount + 471.8$
This means that, for example, the expected popularity of a 500k-word story would be about 3800.
# Word Count vs Popularity (outlier removed): linear axes, then log-log axes.
scatter_kwargs = dict(x='Word Count', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
px.scatter(data_no_outliers, **scatter_kwargs)
px.scatter(data_no_outliers, log_x=True, log_y=True, **scatter_kwargs)
We might expect earlier released games to be more popular, just because they've had more time to acquire readers. But we might also expect later released games to be more popular, because of greater visibility for HG in more recent years, and the more recent release of the omnibus app. Or maybe there is no relation at all. Which hypothesis is true?
# Quick matplotlib look at popularity over release date.
plt.scatter(x=data['Release Date'], y=data['Popularity'])
# okay that wasn't very helpful... but what is the outlier? (It's The Great Tournament)
px.scatter(data, x='Release Date', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
It doesn't look like there's much of a relationship between popularity and release date.
Let's plot the average popularity per genre:
# Mean popularity per genre (cleaned data only), as a bar chart.
genre_pop_group = data_clean.groupby('Genre')[['Popularity']].mean()
px.bar(genre_pop_group, x=genre_pop_group.index, y='Popularity')
# Distribution of Popularity within each genre, from several angles.
genre_box = dict(x='Genre', y='Popularity', hover_data=['HG Story Title'])
px.box(data_clean, **genre_box)
# Log-scale box plot with every individual game shown; also saved to HTML.
games_plot = px.box(data_clean, log_y=True, points='all', **genre_box)
games_plot.show()
games_plot.write_html('./log_popularity_by_genre.html')
# Strip plot: just the points, log scale.
px.strip(data_clean, log_y=True, **genre_box)
# Same box plot with The Great Tournament removed.
px.box(data_no_outliers, **genre_box)
Supernatural, Fantasy, and Superhero are the most popular genres on average. They are also the most popular genres in total, so writers' interests and readers' interests seem to overlap for the most part. Interestingly, it seems that Superhero has the highest floor of popularity: the least popular Superhero game has a popularity of 419, which is the highest minimum popularity of any genre!
We'll introduce dummy variables for all of the genres, as well as "Free?".
# 1. Create categorical variables to represent genre inclusions
# Build one 0/1 indicator column per primary genre. Iterate in sorted order so
# the dummy-column order — and therefore the regression-summary row order — is
# reproducible; bare set() iteration differs between runs. key=str keeps the
# sort from failing if Genre ever contains NaN alongside strings.
genre_cols = []
genre_cols_quoted = []
for genre in sorted(set(data['Genre']), key=str):
    # Vectorized bool -> 0/1 instead of a per-row Python loop.
    data['is_' + genre] = (data['Genre'] == genre).astype(int)
    genre_cols_quoted.append('"is_' + genre + '"')
    genre_cols.append('is_' + genre)
# 'Yup' is how the source sheet marks free games.
data['is_free'] = (data['Free?'] == 'Yup').astype(int)
# Rebuild the cleaned frames so they include the new indicator columns.
data_clean = data[data['# of Google Reviews'].notna()]
data_clean_X = data_clean[['Word Count', 'is_free'] + genre_cols]
data_no_outliers = data_clean[data_clean['HG Story Title'] != 'The Great Tournament']
data_noo_X = data_no_outliers[['Word Count', 'is_free'] + genre_cols]
# Multivariate OLS: Popularity ~ Word Count + is_free + genre dummies.
results = sm.OLS(data_clean['Popularity'], sm.add_constant(data_clean_X)).fit()
print(results.summary())
Now, what the heck does this mean?
If we interpret the linear regression coefficients very literally...
None of the other conclusions are significant. Based on this analysis, we cannot conclusively say that, for example, puzzle or adventure or horror games will have a penalty to popularity, simply because there are too few games in those categories. Similarly, we can't say that school or post-apocalyptic games will have a bonus to popularity.
Now, what if we remove The Great Tournament?
# Same multivariate regression with The Great Tournament excluded.
noo_model = sm.OLS(data_no_outliers['Popularity'], sm.add_constant(data_noo_X))
results = noo_model.fit()
print(results.summary())
These results actually look quite a bit better. Here's a summary:
These are all the "significant" conclusions here. According to the regression coefficients, puzzle, adventure, sci-fi, horror, historical, and steampunk all have penalties, while war, school, and post-apocalyptic all have slight bonuses. But none of those are "significant", because the sample size is very, very small.
We can look at the residuals, which are the true value minus the predicted value. A positive residual indicates that a game was more popular than predicted, while a negative residual indicates that a game was less popular than predicted.
# Predicted vs actual popularity, then the residuals themselves.
predictions = results.predict(sm.add_constant(data_noo_X))
px.scatter(data_no_outliers, x='Popularity', y=predictions, hover_data=['HG Story Title'], labels={'y': 'Predicted popularity'})
px.scatter(data_no_outliers, x='Popularity', y=results.resid, hover_data=['HG Story Title'], labels={'y': 'Residual'})
# Rank games by residual: the biggest positive residuals beat the model's expectation.
data_no_outliers['resid'] = results.resid
more_popular = data_no_outliers.sort_values(by='resid', ascending=False)
more_popular.head(10)
The games that are more popular than expected are, in order (not including The Great Tournament): Life of a Mercenary, Wayhaven 1, Wayhaven 2, Hero or Villain, Doomsday on Demand, Samurai of Hyuga, The Parenting Simulator, War for the West, The Aether, and Evertree Inn.
# Largest negative residuals: games the model expected to be more popular.
less_popular = data_no_outliers.sort_values(by='resid', ascending=True)
less_popular.head(10)
The games that are less popular than expected are, in order: Tin Star, Magikiras, Gambling with Eternity, Elemental Saga, Burn(t), Seven Bullets, Twin Flames, Lost in the Pages, The Aegis Saga, and Best of Us.
What if we added data for secondary genres as well as primary genres?
# 1. Create categorical variables to represent genre inclusions
# Rebuild the genre indicators on a copy of the data, this time counting a
# genre as present if it is EITHER the primary genre or the listed subgenre.
# sorted(..., key=str) keeps column order reproducible (and NaN-safe).
data_subgenres = data.copy()
genre_cols = []
genre_cols_quoted = []
for genre in sorted(set(data['Genre']), key=str):
    in_genre = (data['Genre'] == genre) | (data['Subgenre (if applicable)'] == genre)
    data_subgenres['is_' + genre] = in_genre.astype(int)
    genre_cols_quoted.append('"is_' + genre + '"')
    genre_cols.append('is_' + genre)
data_sg_clean = data_subgenres[data_subgenres['# of Google Reviews'].notna()]
data_sg_clean_X = data_sg_clean[['Word Count', 'is_free'] + genre_cols]
# BUG FIX: build the outlier mask from data_sg_clean itself; the original used
# a mask built on data_clean, which only worked through index alignment.
data_sg_no_outliers = data_sg_clean[data_sg_clean['HG Story Title'] != 'The Great Tournament']
data_sg_noo_X = data_sg_no_outliers[['Word Count', 'is_free'] + genre_cols]
# BUG FIX: regress on the subgenre design matrix data_sg_noo_X; the original
# accidentally reused the primary-genre-only matrix data_noo_X here, so the
# "subgenre model" never actually saw the subgenre dummies.
results = sm.OLS(data_sg_no_outliers['Popularity'], sm.add_constant(data_sg_noo_X)).fit()
print(results.summary())
Well, the $R^2$ is a bit higher here, at 0.363 vs 0.351 for the primary genre-only model. So that's good, indicating that the sub-genre provides some additional information. The coefficients for Word Count and is_free are pretty similar as before.
The interesting thing is that it totally changes the directions of some of the coefficients for the genres:
In order of highest to lowest predicted popularity gain, the genres are:
and these genres are predicted to cause popularity loss:
(* means that the effect of that genre is statistically significant at p<0.05.)
What if we tried to do a regression on log(Word Count) instead?
import numpy as np
# Log-log model: regress log2(Popularity) on log2(Word Count) plus the dummies.
data_no_outliers['log_WC'] = np.log2(data_no_outliers['Word Count'])
data_noo_X = data_no_outliers[['log_WC', 'is_free'] + genre_cols]
log_design = sm.add_constant(data_noo_X)
results_loglog = sm.OLS(np.log2(data_no_outliers['Popularity']), log_design).fit()
print(results_loglog.summary())
# Undo the log transform so predictions are back on the raw popularity scale.
predictions = 2 ** results_loglog.predict(log_design)
px.scatter(data_no_outliers, x='Popularity', y=predictions, hover_data=['HG Story Title'], labels={'y': 'Predictions'})
Is the log regression better than the linear regression? Hard to say. The errors are just as large, but at least there aren't any negative predictions.
Let's just try plotting the Google ratings vs the omnibus ratings.
# Do the two platforms' ratings agree? Omnibus rating vs Google Play score.
px.scatter(
    data_clean,
    x='GPS Score',
    y='Omnibus Rating',
    trendline='ols',
    hover_data=['HG Story Title'],
    title='Omnibus vs Google ratings',
)
Overall, the Omnibus rating and GPS rating correlate very well, with an $R^2$ of 0.7 in the above line.
The regression equation is $OmnibusRating = 0.477*GPS + 2.565$.
Let's plot GPS Score vs GPS ratings count, and Omnibus Rating vs Omnibus rating count.
# Rating vs number of ratings on each platform, smoothed with LOWESS.
hover_cols = ['HG Story Title']
px.scatter(data_clean, x='# of Google Reviews', y='GPS Score', trendline='lowess', hover_data=hover_cols)
px.scatter(data_clean, x='# of Omnibus Ratings', y='Omnibus Rating', trendline='lowess', hover_data=hover_cols)
From this, it's pretty clear that more popular games tend to have higher ratings on both platforms. Not going to try to do any regressions here.
Okay, we're going to do some more regressions. There's probably a better way to do this than OLS because ratings are in a pretty narrow range.
Let's consider the data without sub-genres first:
# Regress Omnibus Rating on word count, the free flag, and primary-genre dummies.
data_noo_X = data_no_outliers[['Word Count', 'is_free'] + genre_cols]
rating_design = sm.add_constant(data_noo_X)
results = sm.OLS(data_no_outliers['Omnibus Rating'], rating_design).fit()
print(results.summary())
Interpretation:
Genres sorted by rating bonuses:
Adventure Spy Fantasy Superhero War Supernatural Slice of Life Post-Apocalyptic Crime Horror Sci-Fi* Historical Romance Mystery Steampunk
And here are the predicted "negative" genres: Puzzle School Humor
Again, a * indicates statistical significance at p<0.05.
Interestingly, Adventure and Spy were some of the worst performing genres in terms of popularity. It's interesting that they seem to give ratings bonuses in the omnibus (but these are very small sample sizes).
Now, let's use the secondary genres:
# Same rating regression, but with the subgenre-aware dummy columns.
data_sg_noo_X = data_sg_no_outliers[['Word Count', 'is_free'] + genre_cols]
sg_rating_design = sm.add_constant(data_sg_noo_X)
results = sm.OLS(data_sg_no_outliers['Omnibus Rating'], sg_rating_design).fit()
print(results.summary())
The R^2 here is slightly worse than only using the primary genre.
The trends with regard to genre are... a lot more unclear vs only using the primary genre. None of the results are significant at p<0.05.
# Box plot of Omnibus Rating by genre (points shown), saved to HTML too.
genre_ratings_plot = px.box(data_clean, x='Genre', y='Omnibus Rating', points='all', hover_data=['HG Story Title'])
genre_ratings_plot.show()
genre_ratings_plot.write_html('genre_ratings_plot.html')
How are we defining "underrated" here? Let's just say, the games whose ratings are higher than expected given their popularity. I think we should do log-popularity because it looks kinda more linear.
# Rating vs log(review count) on each platform, smoothed with LOWESS.
lowess_kwargs = dict(trendline='lowess', hover_data=['HG Story Title'], log_x=True)
px.scatter(data_clean, x='# of Omnibus Ratings', y='Omnibus Rating', **lowess_kwargs)
px.scatter(data_clean, x='# of Google Reviews', y='GPS Score', **lowess_kwargs)
Let's do a linear regression of the omnibus and GPS ratings as a function of log(review count).
# Fit rating ~ log2(review count) per platform; a game's residual measures how
# much better (positive) or worse (negative) it is rated than its popularity
# alone would predict — our working definition of "underrated".
log_omnibus = np.log2(data_no_outliers['# of Omnibus Ratings'])
omnibus_results = sm.OLS(data_no_outliers['Omnibus Rating'], sm.add_constant(log_omnibus)).fit()
print(omnibus_results.summary())
log_gps = np.log2(data_no_outliers['# of Google Reviews'])
gps_results = sm.OLS(data_no_outliers['GPS Score'], sm.add_constant(log_gps)).fit()
print(gps_results.summary())
# Store per-game predictions and residuals for both platforms.
data_no_outliers['omnibus_rating_predictions'] = omnibus_results.predict(sm.add_constant(log_omnibus))
data_no_outliers['omnibus_resid'] = omnibus_results.resid
data_no_outliers['gps_rating_predictions'] = gps_results.predict(sm.add_constant(log_gps))
data_no_outliers['gps_resid'] = gps_results.resid
# Top 10 "underrated" games on each platform.
data_no_outliers.sort_values('omnibus_resid', ascending=False).head(10)
data_no_outliers.sort_values('gps_resid', ascending=False).head(10)